Assignment No. 2¶

Lecturer: Professor Yitzhak Lapidot¶

Gal Neumann - 316120260 & Nizan Ismailov - 208412569¶

A Comparison of Artificial Neural Network and Decision Trees with Logistic Regression as Classification Models for Breast Cancer Survival¶

The data set was import from: NIH - National Cancer Institute (SEER)

Imports¶

In [3]:
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, f1_score, precision_score, recall_score, accuracy_score
from sklearn.tree import export_text
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from sklearn.tree import plot_tree
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, train_test_split, cross_validate, cross_val_score
from scipy.stats import randint
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, roc_curve, confusion_matrix, accuracy_score
from IPython.display import display

Load the data¶

In [4]:
data = pd.read_csv('data.csv')
print(data.head())
  diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0         M        17.99         10.38          122.80     1001.0   
1         M        20.57         17.77          132.90     1326.0   
2         M        19.69         21.25          130.00     1203.0   
3         M        11.42         20.38           77.58      386.1   
4         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   symmetry_mean  ...  concavity_worst  concave points_worst  symmetry_worst  \
0         0.2419  ...           0.7119                0.2654          0.4601   
1         0.1812  ...           0.2416                0.1860          0.2750   
2         0.2069  ...           0.4504                0.2430          0.3613   
3         0.2597  ...           0.6869                0.2575          0.6638   
4         0.1809  ...           0.4000                0.1625          0.2364   

   fractal_dimension_worst  age  stage_of_cancer  treatment_administered  \
0                  0.11890   74          Stage 1               Radiation   
1                  0.08902   77          Stage 1                 Surgery   
2                  0.08758   83          Stage 4                Hormonal   
3                  0.17300   30          Stage 3               Radiation   
4                  0.07678   33          Stage 2                Hormonal   

   duration  censor  survival_probability  
0        20       1              0.829205  
1        67       0              0.780743  
2        93       0              0.507499  
3        38       1              0.480011  
4        62       1              0.485004  

[5 rows x 37 columns]

Data PreProcessing¶

Taking care of missing data¶

In [5]:
data.replace('?', pd.NA, inplace=True)
data.dropna(inplace=True)
data.shape
data.describe()
Out[5]:
(569, 37)
Out[5]:
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst age duration censor survival_probability
count 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 ... 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000
mean 14.127292 19.289649 91.969033 654.889104 0.096360 0.104341 0.088799 0.048919 0.181162 0.062798 ... 0.132369 0.254265 0.272188 0.114606 0.290076 0.083946 59.641476 63.465729 0.794376 0.573025
std 3.524049 4.301036 24.298981 351.914129 0.014064 0.052813 0.079720 0.038803 0.027414 0.007060 ... 0.022832 0.157336 0.208624 0.065732 0.061867 0.018061 17.749472 32.925748 0.404512 0.143626
min 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 0.106000 0.049960 ... 0.071170 0.027290 0.000000 0.000000 0.156500 0.055040 30.000000 6.000000 0.000000 0.204240
25% 11.700000 16.170000 75.170000 420.300000 0.086370 0.064920 0.029560 0.020310 0.161900 0.057700 ... 0.116600 0.147200 0.114500 0.064930 0.250400 0.071460 45.000000 34.000000 1.000000 0.467546
50% 13.370000 18.840000 86.240000 551.100000 0.095870 0.092630 0.061540 0.033500 0.179200 0.061540 ... 0.131300 0.211900 0.226700 0.099930 0.282200 0.080040 60.000000 63.000000 1.000000 0.576885
75% 15.780000 21.800000 104.100000 782.700000 0.105300 0.130400 0.130700 0.074000 0.195700 0.066120 ... 0.146000 0.339100 0.382900 0.161400 0.317900 0.092080 75.000000 92.000000 1.000000 0.689974
max 28.110000 39.280000 188.500000 2501.000000 0.163400 0.345400 0.426800 0.201200 0.304000 0.097440 ... 0.222600 1.058000 1.252000 0.291000 0.663800 0.207500 90.000000 120.000000 1.000000 0.866458

8 rows × 34 columns

As we can see there are no missing values, "SEER" datasets shoul'd be clean and without missing values.

Change the target (Survival_probability) to binary terms and remove "survival_probability" and "censor" from data¶

Acording to the article, the goal is to predict whether a patient survived or not.

"1" = Survived , "0" = Didn't Survived

In [6]:
data['target'] = (data['survival_probability'] > 0.5).astype(int)
Binary_data = data.copy()

data = data.drop('survival_probability', axis=1)
data = data.drop('censor', axis=1)
data.to_csv('data_01.csv', index=False)

data.head()
Out[6]:
diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst age stage_of_cancer treatment_administered duration target
0 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 ... 0.6656 0.7119 0.2654 0.4601 0.11890 74 Stage 1 Radiation 20 1
1 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 ... 0.1866 0.2416 0.1860 0.2750 0.08902 77 Stage 1 Surgery 67 1
2 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 ... 0.4245 0.4504 0.2430 0.3613 0.08758 83 Stage 4 Hormonal 93 1
3 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 ... 0.8663 0.6869 0.2575 0.6638 0.17300 30 Stage 3 Radiation 38 0
4 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 ... 0.2050 0.4000 0.1625 0.2364 0.07678 33 Stage 2 Hormonal 62 0

5 rows × 36 columns

Now we will split the data into 80% train and 20% test.¶

We made sure to preserve target variable distribution using Stratification Parameter.

In [7]:
X = data.drop(columns=['target'])
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

print(f"Stratification of train set: {y_train.mean()}")
print(f"Stratification of test set: {y_test.mean()}")
Stratification of train set: 0.6659340659340659
Stratification of test set: 0.6666666666666666

Exports the sets

In [8]:
train_data.to_csv('train_data.csv')
test_data.to_csv('test_data.csv')

Adjust the data into Categorical and Numerical datatypes¶

In [9]:
objectAttributesKey = ["diagnosis", "stage_of_cancer", "treatment_administered"]
integerAttributesKey = ["age"]

data[objectAttributesKey] = data[objectAttributesKey].astype('category')
data[integerAttributesKey] = data[integerAttributesKey].astype(int)

data = pd.get_dummies(data, columns=objectAttributesKey, drop_first=True)

data.to_csv('train_data_01.csv', index=False)
In [10]:
data.head()
Out[10]:
radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean fractal_dimension_mean ... duration target diagnosis_M stage_of_cancer_Stage 2 stage_of_cancer_Stage 3 stage_of_cancer_Stage 4 treatment_administered_Combination treatment_administered_Hormonal treatment_administered_Radiation treatment_administered_Surgery
0 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 0.07871 ... 20 1 True False False False False False True False
1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 0.05667 ... 67 1 True False False False False False False True
2 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 0.05999 ... 93 1 True False False True False True False False
3 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 0.09744 ... 38 0 True False True False False False True False
4 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 0.05883 ... 62 0 True True False False False True False False

5 rows × 41 columns

We will check for the distribution of 'target'

In [11]:
train_data['target'].value_counts()
Out[11]:
count
target
1 303
0 152

Identify outliers¶

we'll calculate the 1.5 times the Interquartile Range (IQR) and visualize them using boxplots.

In [12]:
train_data['target'] = train_data['target'].map({1: 'Survived', 0: "Didn't Survived"})

numerical_columns = train_data.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = [col for col in numerical_columns if col != 'target']

sns.set(rc={'figure.figsize': (6, 3)})
sns.set_context("talk")

for column in numerical_columns:
  plt.figure(figsize=(6, 3))
  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")

  plt.title(f'Boxplot of {column} vs Target', fontsize=16)
  plt.xlabel('target', fontsize=14)
  plt.ylabel(column, fontsize=14)

  plt.show()
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='radius_mean'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of radius_mean vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'radius_mean')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='texture_mean'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of texture_mean vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'texture_mean')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='perimeter_mean'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of perimeter_mean vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'perimeter_mean')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='area_mean'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of area_mean vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'area_mean')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='smoothness_mean'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of smoothness_mean vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'smoothness_mean')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='compactness_mean'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of compactness_mean vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'compactness_mean')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='concavity_mean'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of concavity_mean vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'concavity_mean')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='concave points_mean'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of concave points_mean vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'concave points_mean')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='symmetry_mean'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of symmetry_mean vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'symmetry_mean')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='fractal_dimension_mean'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of fractal_dimension_mean vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'fractal_dimension_mean')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='radius_se'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of radius_se vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'radius_se')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='texture_se'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of texture_se vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'texture_se')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='perimeter_se'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of perimeter_se vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'perimeter_se')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='area_se'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of area_se vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'area_se')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='smoothness_se'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of smoothness_se vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'smoothness_se')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='compactness_se'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of compactness_se vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'compactness_se')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='concavity_se'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of concavity_se vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'concavity_se')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='concave points_se'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of concave points_se vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'concave points_se')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='symmetry_se'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of symmetry_se vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'symmetry_se')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='fractal_dimension_se'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of fractal_dimension_se vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'fractal_dimension_se')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='radius_worst'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of radius_worst vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'radius_worst')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='texture_worst'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of texture_worst vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'texture_worst')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='perimeter_worst'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of perimeter_worst vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'perimeter_worst')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='area_worst'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of area_worst vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'area_worst')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='smoothness_worst'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of smoothness_worst vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'smoothness_worst')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='compactness_worst'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of compactness_worst vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'compactness_worst')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='concavity_worst'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of concavity_worst vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'concavity_worst')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='concave points_worst'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of concave points_worst vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'concave points_worst')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='symmetry_worst'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of symmetry_worst vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'symmetry_worst')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='fractal_dimension_worst'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of fractal_dimension_worst vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'fractal_dimension_worst')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='age'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of age vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'age')
Out[12]:
<Figure size 600x300 with 0 Axes>
<ipython-input-12-c8da27f086f5>:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='target', y=column, data=train_data, orient="v", palette="deep")
Out[12]:
<Axes: xlabel='target', ylabel='duration'>
Out[12]:
Text(0.5, 1.0, 'Boxplot of duration vs Target')
Out[12]:
Text(0.5, 0, 'target')
Out[12]:
Text(0, 0.5, 'duration')

It appears that there are some patients with more then one outlier, we'll remove them from the data set.

In [13]:
train_data = pd.read_csv('train_data.csv')

numerical_columns = train_data.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = [col for col in numerical_columns if col != 'target']

outliers_mask = pd.Series([False] * len(train_data))

for column in numerical_columns:
    Q1 = train_data[column].quantile(0.25)
    Q3 = train_data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers_mask |= (train_data[column] < lower_bound) | (train_data[column] > upper_bound)

train_data = train_data[~outliers_mask]

rows_removed = outliers_mask.sum()
print(f"Number of patients removed: {rows_removed}")
train_data.to_csv('train_data_02.csv')
Number of patients removed: 138

EDA¶

Visualizing Features¶

Apply transformations based on the features's skewness and remove features that still have significant skewness after the transformations

In [14]:
fig, axes = plt.subplots(10, 4, figsize=(40, 40))

axes = axes.flatten()

for index, columnName in enumerate(train_data.columns[1:]):
    ax = axes[index]
    if train_data[columnName].dtype == 'object':
        sns.countplot(x=columnName, data=train_data, ax=ax)
    else:
        sns.histplot(x=columnName, data=train_data, ax=ax)

    ax.set_title(columnName)

plt.tight_layout()
plt.show()

numerical_features = train_data.select_dtypes(include=['int64', 'float64']).columns[1:]

skewed_features = train_data[numerical_features].skew().sort_values(ascending=False)

skewness_df = pd.DataFrame({'Skew': skewed_features})

print(skewness_df)
Out[14]:
<Axes: xlabel='diagnosis', ylabel='count'>
Out[14]:
Text(0.5, 1.0, 'diagnosis')
Out[14]:
<Axes: xlabel='radius_mean', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'radius_mean')
Out[14]:
<Axes: xlabel='texture_mean', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'texture_mean')
Out[14]:
<Axes: xlabel='perimeter_mean', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'perimeter_mean')
Out[14]:
<Axes: xlabel='area_mean', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'area_mean')
Out[14]:
<Axes: xlabel='smoothness_mean', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'smoothness_mean')
Out[14]:
<Axes: xlabel='compactness_mean', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'compactness_mean')
Out[14]:
<Axes: xlabel='concavity_mean', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'concavity_mean')
Out[14]:
<Axes: xlabel='concave points_mean', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'concave points_mean')
Out[14]:
<Axes: xlabel='symmetry_mean', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'symmetry_mean')
Out[14]:
<Axes: xlabel='fractal_dimension_mean', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'fractal_dimension_mean')
Out[14]:
<Axes: xlabel='radius_se', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'radius_se')
Out[14]:
<Axes: xlabel='texture_se', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'texture_se')
Out[14]:
<Axes: xlabel='perimeter_se', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'perimeter_se')
Out[14]:
<Axes: xlabel='area_se', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'area_se')
Out[14]:
<Axes: xlabel='smoothness_se', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'smoothness_se')
Out[14]:
<Axes: xlabel='compactness_se', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'compactness_se')
Out[14]:
<Axes: xlabel='concavity_se', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'concavity_se')
Out[14]:
<Axes: xlabel='concave points_se', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'concave points_se')
Out[14]:
<Axes: xlabel='symmetry_se', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'symmetry_se')
Out[14]:
<Axes: xlabel='fractal_dimension_se', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'fractal_dimension_se')
Out[14]:
<Axes: xlabel='radius_worst', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'radius_worst')
Out[14]:
<Axes: xlabel='texture_worst', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'texture_worst')
Out[14]:
<Axes: xlabel='perimeter_worst', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'perimeter_worst')
Out[14]:
<Axes: xlabel='area_worst', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'area_worst')
Out[14]:
<Axes: xlabel='smoothness_worst', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'smoothness_worst')
Out[14]:
<Axes: xlabel='compactness_worst', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'compactness_worst')
Out[14]:
<Axes: xlabel='concavity_worst', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'concavity_worst')
Out[14]:
<Axes: xlabel='concave points_worst', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'concave points_worst')
Out[14]:
<Axes: xlabel='symmetry_worst', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'symmetry_worst')
Out[14]:
<Axes: xlabel='fractal_dimension_worst', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'fractal_dimension_worst')
Out[14]:
<Axes: xlabel='age', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'age')
Out[14]:
<Axes: xlabel='stage_of_cancer', ylabel='count'>
Out[14]:
Text(0.5, 1.0, 'stage_of_cancer')
Out[14]:
<Axes: xlabel='treatment_administered', ylabel='count'>
Out[14]:
Text(0.5, 1.0, 'treatment_administered')
Out[14]:
<Axes: xlabel='duration', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'duration')
Out[14]:
<Axes: xlabel='target', ylabel='Count'>
Out[14]:
Text(0.5, 1.0, 'target')

Now we'll perform Log Transformation for the features that has right skewness and Square Root Transformation to the features that has left skewness.

In [15]:
# Threshold for skewness
threshold = 0.75

def transform_features(train_data, skewed_features, threshold=0.75):
    changed_features = {}
    for feature in skewed_features.index:
        skewness = skewed_features[feature]
        original_values = train_data[feature].copy()

        if skewness > threshold:
            if (train_data[feature] > 0).all():
                train_data[feature] = np.log1p(train_data[feature])
            else:
                print(f"Skipping {feature} due to negative or zero values in the data.")

        elif skewness < -threshold:
            train_data[feature] = np.sqrt(train_data[feature].abs())

        if not train_data[feature].equals(original_values):
            changed_features[feature] = {
                'original_skewness': skewness,
                'new_skewness': train_data[feature].skew()
            }

    return train_data, changed_features

train_data_transformed, changed_features = transform_features(train_data.copy(), skewed_features, threshold)

if changed_features:
    print("\nFeatures with Skewness Changes:")
    for feature, skewness_info in changed_features.items():
        print(f"Feature: {feature}")
        print(f"  Original Skewness: {skewness_info['original_skewness']}")
        print(f"  New Skewness: {skewness_info['new_skewness']}\n")
else:
    print("No features had significant changes in skewness.")

if changed_features:
    fig, axes = plt.subplots(3, 4, figsize=(40, 40))
    axes = axes.flatten() if len(changed_features) > 1 else [axes]

    for ax, (feature, _) in zip(axes, changed_features.items()):
        sns.histplot(train_data_transformed[feature], ax=ax)
        ax.set_title(f"{feature} (Transformed)")

    plt.tight_layout()
    plt.show()
                             Skew
area_se                  1.500621
concavity_mean           1.341517
area_worst               1.259710
fractal_dimension_se     1.152364
area_mean                1.139781
concave points_mean      1.109217
perimeter_se             1.056725
radius_se                1.045883
concavity_se             0.982360
compactness_se           0.938201
compactness_worst        0.927227
concavity_worst          0.905529
symmetry_se              0.823085
radius_worst             0.803780
perimeter_worst          0.779180
compactness_mean         0.727498
fractal_dimension_worst  0.726821
smoothness_se            0.694778
texture_se               0.665351
perimeter_mean           0.634102
radius_mean              0.609254
fractal_dimension_mean   0.545257
concave points_worst     0.535519
symmetry_worst           0.455974
texture_mean             0.377752
texture_worst            0.264952
smoothness_worst         0.241018
concave points_se        0.228033
smoothness_mean          0.184778
symmetry_mean            0.141812
age                      0.088668
duration                -0.069092
target                  -0.631235
Skipping concavity_mean due to negative or zero values in the data.
Skipping concave points_mean due to negative or zero values in the data.
Skipping concavity_se due to negative or zero values in the data.
Skipping concavity_worst due to negative or zero values in the data.

Features with Skewness Changes:
Feature: area_se
  Original Skewness: 1.500620894091251
  New Skewness: 0.37384820311316624

Feature: area_worst
  Original Skewness: 1.2597098098356303
  New Skewness: 0.20756307817678538

Feature: fractal_dimension_se
  Original Skewness: 1.1523641192918657
  New Skewness: 1.14801861775621

Feature: area_mean
  Original Skewness: 1.139780952552101
  New Skewness: -0.010332358714100688

Feature: perimeter_se
  Original Skewness: 1.056724920701799
  New Skewness: 0.43460137178308555

Feature: radius_se
  Original Skewness: 1.045883338017319
  New Skewness: 0.8134414067927772

Feature: compactness_se
  Original Skewness: 0.938200774575379
  New Skewness: 0.9140320773551073

Feature: compactness_worst
  Original Skewness: 0.9272273016618426
  New Skewness: 0.7074258537878015

Feature: symmetry_se
  Original Skewness: 0.8230851527881472
  New Skewness: 0.8096950632228345

Feature: radius_worst
  Original Skewness: 0.8037804580212827
  New Skewness: 0.2781036839218181

Feature: perimeter_worst
  Original Skewness: 0.779180092711607
  New Skewness: 0.22718500225809612

Out[15]:
<Axes: xlabel='area_se', ylabel='Count'>
Out[15]:
Text(0.5, 1.0, 'area_se (Transformed)')
Out[15]:
<Axes: xlabel='area_worst', ylabel='Count'>
Out[15]:
Text(0.5, 1.0, 'area_worst (Transformed)')
Out[15]:
<Axes: xlabel='fractal_dimension_se', ylabel='Count'>
Out[15]:
Text(0.5, 1.0, 'fractal_dimension_se (Transformed)')
Out[15]:
<Axes: xlabel='area_mean', ylabel='Count'>
Out[15]:
Text(0.5, 1.0, 'area_mean (Transformed)')
Out[15]:
<Axes: xlabel='perimeter_se', ylabel='Count'>
Out[15]:
Text(0.5, 1.0, 'perimeter_se (Transformed)')
Out[15]:
<Axes: xlabel='radius_se', ylabel='Count'>
Out[15]:
Text(0.5, 1.0, 'radius_se (Transformed)')
Out[15]:
<Axes: xlabel='compactness_se', ylabel='Count'>
Out[15]:
Text(0.5, 1.0, 'compactness_se (Transformed)')
Out[15]:
<Axes: xlabel='compactness_worst', ylabel='Count'>
Out[15]:
Text(0.5, 1.0, 'compactness_worst (Transformed)')
Out[15]:
<Axes: xlabel='symmetry_se', ylabel='Count'>
Out[15]:
Text(0.5, 1.0, 'symmetry_se (Transformed)')
Out[15]:
<Axes: xlabel='radius_worst', ylabel='Count'>
Out[15]:
Text(0.5, 1.0, 'radius_worst (Transformed)')
Out[15]:
<Axes: xlabel='perimeter_worst', ylabel='Count'>
Out[15]:
Text(0.5, 1.0, 'perimeter_worst (Transformed)')

After the transformations, we'll remove the features that still has high skewness

In [16]:
final_skewness = train_data_transformed[numerical_features].skew().sort_values(ascending=False)

features_to_remove = final_skewness[final_skewness.abs() > threshold].index

if len(features_to_remove) > 0:
    print(f"\nFeatures to be removed due to high skewness (>{threshold}):")
    for feature in features_to_remove:
        print(f" - {feature}")
else:
    print("\nNo features to remove. All features are within the skewness threshold.")

train_data_cleaned = train_data_transformed.drop(columns=features_to_remove)

print(f"\nNumber of features removed: {len(features_to_remove)}")

train_data_cleaned.to_csv('train_data_03.csv', index=False)
Features to be removed due to high skewness (>0.75):
 - concavity_mean
 - fractal_dimension_se
 - concave points_mean
 - concavity_se
 - compactness_se
 - concavity_worst
 - radius_se
 - symmetry_se

Number of features removed: 8

Feature Selection Based on Correlation Matrix¶

Iterative process where we remove one feature at a time, re-calculate the correlation matrix, and then reassess which features to remove next. This approach can help address any changes in correlation dynamics after each feature is removed.

In [17]:
numerical_data = train_data_cleaned.select_dtypes(include=['float64', 'int64'])

correlation_threshold = 0.8

def find_most_correlated(data, threshold):
    correlation_matrix = data.corr()
    features = correlation_matrix.columns
    for i in range(len(features)):
        for j in range(i):
            if abs(correlation_matrix.iloc[i, j]) > threshold:
                return (features[i], features[j])
    return None

while True:
    pair = find_most_correlated(numerical_data, correlation_threshold)
    if pair is None:
        break
    feature_to_remove = pair[0]
    numerical_data = numerical_data.drop(columns=[feature_to_remove])
    print(f"Removed {feature_to_remove} due to high correlation with {pair[1]}")

plt.figure(figsize=(20, 20))
sns.heatmap(numerical_data.corr(), annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Final Correlation Matrix for Numerical Features')
plt.show()

numerical_data.to_csv('reduced_data.csv', index=False)
Removed perimeter_mean due to high correlation with radius_mean
Removed area_mean due to high correlation with radius_mean
Removed area_se due to high correlation with perimeter_se
Removed radius_worst due to high correlation with radius_mean
Removed texture_worst due to high correlation with texture_mean
Removed perimeter_worst due to high correlation with radius_mean
Removed area_worst due to high correlation with radius_mean
Removed smoothness_worst due to high correlation with smoothness_mean
Removed compactness_worst due to high correlation with compactness_mean
Out[17]:
<Figure size 2000x2000 with 0 Axes>
Out[17]:
<Axes: >
Out[17]:
Text(0.5, 1.0, 'Final Correlation Matrix for Numerical Features')

Feature Scaling¶

Standardize or normalize your data especially for ANN.

In [18]:
scaler = StandardScaler()
data[data.columns] = scaler.fit_transform(data[data.columns])

Modularized Data Processing Functions for Predictive Models:¶

In [19]:
def create_target_column(dataset):
    dataset['target'] = (dataset['survival_probability'] > 0.5).astype(int)
    dataset.drop(columns=['survival_probability'], inplace=True)
    return dataset

def get_outliers_mask(dataset):

    numerical_columns = dataset.select_dtypes(include=['int64', 'float64']).columns
    outliers_mask = pd.Series([False] * len(dataset), index=dataset.index)

    for column in numerical_columns:
        Q1 = dataset[column].quantile(0.25)
        Q3 = dataset[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        outliers_mask |= (dataset[column] < lower_bound) | (dataset[column] > upper_bound)

    return outliers_mask

def outliers_removal(X, y):

    outliers_mask = get_outliers_mask(X)
    X = X[~outliers_mask]
    y = y.loc[X.index]
    return X, y

def impute_values(X):

    X.replace('?', pd.NA, inplace=True)
    X.dropna(inplace=True)
    return X

def transform_features(X, skewness_threshold=0.75):
    numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns

    skewed_features = X[numerical_columns].skew().sort_values(ascending=False)

    for feature in skewed_features.index:
        skewness = skewed_features[feature]

        if skewness > skewness_threshold:
            if (X[feature] > 0).all():
                X[feature] = np.log1p(X[feature])
            else:
                print(f"Skipping log transformation for {feature} due to zero or negative values.")
        elif skewness < -skewness_threshold:
            X[feature] = np.sqrt(np.abs(X[feature]))

    return X

def remove_skewed_features(X, skewness_threshold=0.75):
    numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
    skewed_features = X[numerical_columns].skew().sort_values(ascending=False)

    features_to_remove = skewed_features[abs(skewed_features) > skewness_threshold].index

    if len(features_to_remove) > 0:
        X = X.drop(columns=features_to_remove)

    return X

def one_hot_encode(X):
    categorical_columns = ['diagnosis', 'stage_of_cancer', 'treatment_administered']
    X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)
    return X

def feature_selection_correlation(X, correlation_threshold=0.8):
    def find_most_correlated(data, threshold):
        correlation_matrix = data.corr()
        features = correlation_matrix.columns
        max_correlation = threshold
        feature_to_remove = None
        for i in range(len(features)):
            for j in range(i):
                if abs(correlation_matrix.iloc[i, j]) > max_correlation:
                    max_correlation = abs(correlation_matrix.iloc[i, j])
                    feature_to_remove = features[i] if abs(correlation_matrix.iloc[i, j]) >= abs(correlation_matrix.iloc[j, i]) else features[j]
        return feature_to_remove

    while True:
        feature_to_remove = find_most_correlated(X, correlation_threshold)
        if feature_to_remove is None:
            break
        X = X.drop(columns=[feature_to_remove])

    return X

def split_data(dataset):
    X = dataset.drop(columns=['target'])
    y = dataset['target']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

    return X_train, X_test, y_train, y_test

def apply_transformations(X, y):
    X, y = outliers_removal(X, y)
    X = impute_values(X)
    X = transform_features(X)
    X = remove_skewed_features(X)
    X = one_hot_encode(X)
    X = feature_selection_correlation(X)

    return X, y

def process_data(dataset):
    dataset = create_target_column(dataset)
    X_train, X_test, y_train, y_test = split_data(dataset)
    X_train, y_train = apply_transformations(X_train, y_train)
    X_test, y_test = apply_transformations(X_test, y_test)
    common_columns = X_train.columns.intersection(X_test.columns)
    X_train = X_train[common_columns]
    X_test = X_test[common_columns]

    return X_train, X_test, y_train, y_test

Breakdown of the Functions¶

  1. Create Target Column and Remove Survival Probability
  2. Outliers Removal: Identifies and removes outliers using the IQR method for all numerical features.
  3. Impute Missing Values: Handles missing values by replacing ? with NaN and dropping rows with missing values.
  4. Transformation for Skewness Correction: Applies log transformation for right-skewed data and square root transformation for left-skewed data.
  5. Remove Highly Skewed Features: Removes features with high skewness that remain even after transformations.
  6. One-Hot Encoding: Converts categorical variables into dummy variables (binary format), excluding the first category to avoid multicollinearity.
  7. Feature Selection Based on Correlation: Identifies and removes features that have a correlation higher than the specified threshold (e.g., 0.8).
  8. Split Data into Train and Test Sets (with Stratification).
In [20]:
td = pd.read_csv('data.csv')
X_train, X_test, y_train, y_test = process_data(td)
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")
results = {}
<ipython-input-19-b0ac60b7fdca>:31: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace('?', pd.NA, inplace=True)
<ipython-input-19-b0ac60b7fdca>:32: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.dropna(inplace=True)
<ipython-input-19-b0ac60b7fdca>:45: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[feature] = np.log1p(X[feature])
Skipping log transformation for concavity_mean due to zero or negative values.
Skipping log transformation for concave points_mean due to zero or negative values.
Skipping log transformation for concavity_se due to zero or negative values.
Skipping log transformation for concavity_worst due to zero or negative values.
Skipping log transformation for concavity_mean due to zero or negative values.
Skipping log transformation for concave points_mean due to zero or negative values.
Skipping log transformation for concavity_se due to zero or negative values.
Skipping log transformation for concavity_worst due to zero or negative values.
X_train shape: (255, 21)
X_test shape: (61, 21)
y_train shape: (255,)
y_test shape: (61,)
<ipython-input-19-b0ac60b7fdca>:31: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace('?', pd.NA, inplace=True)
<ipython-input-19-b0ac60b7fdca>:32: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.dropna(inplace=True)
<ipython-input-19-b0ac60b7fdca>:45: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[feature] = np.log1p(X[feature])

Logistic Regression¶

Basic Training Function¶

This function will just train the logistic regression model on the entire training set without any cross-validation.

In [21]:
def train_logistic_regression(X_train, y_train):
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    return model

Cross-Validation Training Function¶

This function will perform cross-validation using 5 folds.

In [22]:
def train_and_evaluate_logistic_regression_with_cv(X_train, y_train, n_splits=5):
    model = LogisticRegression(max_iter=1000, random_state=42)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    cv_results = []
    for train_index, val_index in kf.split(X_train):
        X_train_k, X_val_k = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_k, y_val_k = y_train.iloc[train_index], y_train.iloc[val_index]
        model.fit(X_train_k, y_train_k)
        results = evaluate_logistic_regression(model, X_val_k, y_val_k, f"Validation Fold {len(cv_results)+1}")
        cv_results.append(results)

    # Calculate average of the results
    average_results = pd.DataFrame(cv_results).mean().to_dict()
    print(f"Average K-Folds results: {average_results}")

    return model, average_results

Evaluation Function¶

This function evaluates the model and returns the results.

In [23]:
def evaluate_logistic_regression(model, X, y, dataset_name):
    y_pred = model.predict(X)
    y_prob = model.predict_proba(X)[:, 1]

    accuracy = accuracy_score(y, y_pred)
    roc_auc = roc_auc_score(y, y_prob)
    cm = confusion_matrix(y, y_pred)
    sensitivity = cm[1, 1] / (cm[1, 1] + cm[1, 0])
    specificity = cm[0, 0] / (cm[0, 0] + cm[0, 1])

    plt.figure(figsize=(6, 3))
    fpr, tpr, _ = roc_curve(y, y_prob)
    plt.plot(fpr, tpr, label=f'{dataset_name} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {dataset_name}')
    plt.legend()
    plt.show()

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Greens", cbar=False, square=True)
    plt.title(f'Confusion Matrix - {dataset_name}')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    return {'Accuracy': accuracy, 'ROC AUC': roc_auc, 'Sensitivity': sensitivity, 'Specificity': specificity}
In [24]:
initial_lr_model = train_logistic_regression(X_train, y_train)

original_train_results = evaluate_logistic_regression(initial_lr_model, X_train, y_train, "Initial Train Set")

cv_lr_model, cv_train_results = train_and_evaluate_logistic_regression_with_cv(X_train, y_train)

test_results = evaluate_logistic_regression(cv_lr_model, X_test, y_test, "Test Set")

test_results = evaluate_logistic_regression(cv_lr_model, X_test, y_test, "Test Set")

results_df = pd.DataFrame({
    'Logistic Regression': original_train_results,
    'Logistic Regression (CV)': cv_train_results,
    'Logistic Regression (Test)': test_results
}).T

display(results_df)
Average K-Folds results: {'Accuracy': 0.9254901960784314, 'ROC AUC': 0.9819241461520323, 'Sensitivity': 0.9485377236193934, 'Specificity': 0.8938641069075853}
Accuracy ROC AUC Sensitivity Specificity
Logistic Regression 0.952941 0.992431 0.962025 0.938144
Logistic Regression (CV) 0.925490 0.981924 0.948538 0.893864
Logistic Regression (Test) 0.918033 0.970862 0.923077 0.909091

Decision Tree¶

Basic train of the Decision Tree Model¶

In [25]:
def train_decision_tree(X_train, y_train, criterion='gini', max_depth=3, min_samples_split=10, min_samples_leaf=5):
    dt_model = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth,
                                      min_samples_split=min_samples_split,
                                      min_samples_leaf=min_samples_leaf,
                                      random_state=42)
    dt_model.fit(X_train, y_train)
    return dt_model

Train the Desicion Tree with 5-Folds Cross-Validation¶

In [26]:
def train_decision_tree_with_cv(X_train, y_train, n_splits=5, criterion='gini', max_depth=3, min_samples_split=10, min_samples_leaf=5):
    model = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth,
                                   min_samples_split=min_samples_split,
                                   min_samples_leaf=min_samples_leaf,
                                   random_state=42)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for train_index, val_index in kf.split(X_train):
        X_train_k, _ = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_k, _ = y_train.iloc[train_index], y_train.iloc[val_index]
        model.fit(X_train_k, y_train_k)

    return model

Evaluate the Decision Tree Model¶

In [27]:
def visualize_tree(dt_model, feature_names):
    plt.figure(figsize=(20, 10))
    plot_tree(dt_model, filled=True, feature_names=feature_names, class_names=['0', '1'], rounded=True)
    plt.title('Decision Tree Visualization')
    plt.show()

def evaluate_decision_tree(model, X, y, dataset_name):
    y_pred = model.predict(X)
    y_prob = model.predict_proba(X)[:, 1]

    accuracy = accuracy_score(y, y_pred)
    roc_auc = roc_auc_score(y, y_prob)
    cm = confusion_matrix(y, y_pred)
    sensitivity = cm[1, 1] / (cm[1, 1] + cm[1, 0])
    specificity = cm[0, 0] / (cm[0, 0] + cm[0, 1])

    plt.figure(figsize=(6, 3))
    fpr, tpr, _ = roc_curve(y, y_prob)
    plt.plot(fpr, tpr, label=f'{dataset_name} (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {dataset_name}')
    plt.legend()
    plt.show()

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Greens", cbar=False, square=True)
    plt.title(f'Confusion Matrix - {dataset_name}')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    return {'Accuracy': accuracy, 'ROC AUC': roc_auc, 'Sensitivity': sensitivity, 'Specificity': specificity}
In [28]:
dt_initial = train_decision_tree(X_train, y_train)
initial_train_results_dt = evaluate_decision_tree(dt_initial, X_train, y_train, "Decision Tree - Initial Train Set")
visualize_tree(dt_initial, X_train.columns)

results["Decision Tree - Initial Train Set"] = initial_train_results_dt

results_df = pd.DataFrame({
    'Logistic Regression': original_train_results,
    'Logistic Regression (CV)': cv_train_results,
    'Logistic Regression (Test)': test_results,
    'Decision Tree': initial_train_results_dt,
}).T
display(results_df)
Accuracy ROC AUC Sensitivity Specificity
Logistic Regression 0.952941 0.992431 0.962025 0.938144
Logistic Regression (CV) 0.925490 0.981924 0.948538 0.893864
Logistic Regression (Test) 0.918033 0.970862 0.923077 0.909091
Decision Tree 0.862745 0.943886 0.936709 0.742268
In [29]:
dt_cv = train_decision_tree_with_cv(X_train, y_train)
cv_train_results_dt = evaluate_decision_tree(dt_cv, X_train, y_train, "Decision Tree - CV Train Set")
visualize_tree(dt_cv, X_train.columns)
results["Decision Tree - CV Train Set"] = cv_train_results_dt

results_df = pd.DataFrame({
    'Logistic Regression': original_train_results,
    'Logistic Regression (CV)': cv_train_results,
    'Logistic Regression (Test)': test_results,
    'Decision Tree': initial_train_results_dt,
    'Decision Tree (CV)': cv_train_results_dt
}).T
display(results_df)
Accuracy ROC AUC Sensitivity Specificity
Logistic Regression 0.952941 0.992431 0.962025 0.938144
Logistic Regression (CV) 0.925490 0.981924 0.948538 0.893864
Logistic Regression (Test) 0.918033 0.970862 0.923077 0.909091
Decision Tree 0.862745 0.943886 0.936709 0.742268
Decision Tree (CV) 0.858824 0.924670 0.822785 0.917526
In [30]:
test_results_dt = evaluate_decision_tree(dt_cv, X_test, y_test, "Decision Tree - Test Set")
results["Decision Tree - Test Set"] = test_results_dt

results_df = pd.DataFrame({
    'Logistic Regression': original_train_results,
    'Logistic Regression (CV)': cv_train_results,
    'Logistic Regression (Test)': test_results,
    'Decision Tree': initial_train_results_dt,
    'Decision Tree (CV)': cv_train_results_dt,
    'Decision Tree (Test)': test_results_dt
}).T
display(results_df)
Accuracy ROC AUC Sensitivity Specificity
Logistic Regression 0.952941 0.992431 0.962025 0.938144
Logistic Regression (CV) 0.925490 0.981924 0.948538 0.893864
Logistic Regression (Test) 0.918033 0.970862 0.923077 0.909091
Decision Tree 0.862745 0.943886 0.936709 0.742268
Decision Tree (CV) 0.858824 0.924670 0.822785 0.917526
Decision Tree (Test) 0.786885 0.828089 0.820513 0.727273

Simulating CHAID

In [31]:
dt_model_gini = train_decision_tree(X_train, y_train, criterion='gini', max_depth=3)
evaluate_decision_tree(dt_model_gini, X_test, y_test, 'Decision Tree - Gini - Limited Depth')
visualize_tree(dt_model_gini, X_train.columns)

dt_model_entropy = train_decision_tree(X_train, y_train, criterion='entropy', max_depth=3)
evaluate_decision_tree(dt_model_entropy, X_test, y_test, 'Decision Tree - Entropy - Limited Depth')
visualize_tree(dt_model_entropy, X_train.columns)

results_df = pd.DataFrame({
    'Logistic Regression': original_train_results,
    'Logistic Regression (CV)': cv_train_results,
    'Logistic Regression (Test)': test_results,
    'Decision Tree': initial_train_results_dt,
    'Decision Tree (CV)': cv_train_results_dt,
    'Decision Tree (Test)': test_results_dt,
    'Decision Tree - Gini - Limited Depth': evaluate_decision_tree(dt_model_gini, X_test, y_test, 'Decision Tree - Gini - Limited Depth'),
    'Decision Tree - Entropy - Limited Depth': evaluate_decision_tree(dt_model_entropy, X_test, y_test, 'Decision Tree - Entropy - Limited Depth')
}).T
display(results_df)
Out[31]:
{'Accuracy': 0.8032786885245902,
 'ROC AUC': 0.8315850815850816,
 'Sensitivity': 0.9487179487179487,
 'Specificity': 0.5454545454545454}
Out[31]:
{'Accuracy': 0.7868852459016393,
 'ROC AUC': 0.784965034965035,
 'Sensitivity': 0.8974358974358975,
 'Specificity': 0.5909090909090909}
Accuracy ROC AUC Sensitivity Specificity
Logistic Regression 0.952941 0.992431 0.962025 0.938144
Logistic Regression (CV) 0.925490 0.981924 0.948538 0.893864
Logistic Regression (Test) 0.918033 0.970862 0.923077 0.909091
Decision Tree 0.862745 0.943886 0.936709 0.742268
Decision Tree (CV) 0.858824 0.924670 0.822785 0.917526
Decision Tree (Test) 0.786885 0.828089 0.820513 0.727273
Decision Tree - Gini - Limited Depth 0.803279 0.831585 0.948718 0.545455
Decision Tree - Entropy - Limited Depth 0.786885 0.784965 0.897436 0.590909

ANN (Artificial Neural Networks)¶

  1. Model Setup: We'll use MLPClassifier from sklearn, which is a feedforward neural network implementation.
  2. Network Architecture: We'll start with a simple architecture (one hidden layer) and gradually increase complexity if needed.
  3. Training and Evaluation: The process will be similar to what we've done for Logistic Regression and Decision Trees, using ROC AUC, Sensitivity, and Specificity for evaluation.

Train the Neural Network (ANN)

In [32]:
def train_neural_network(X_train, y_train, hidden_layer_sizes=(100,), activation='relu', max_iter=1000):
    ann_model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, max_iter=max_iter, random_state=42, verbose=True)
    ann_model.fit(X_train, y_train)
    return ann_model

Train ANN with K-Folds Cross-Validation¶

In [33]:
def train_neural_network_with_kfolds(X, y, n_splits=5, hidden_layer_sizes=(100,), activation='relu', max_iter=1000):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    models = []
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        ann_model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, max_iter=max_iter, random_state=42, verbose=True)
        ann_model.fit(X_train, y_train)

        models.append(ann_model)

    return models

Evaluate the Neural Network

In [34]:
def evaluate_neural_network(ann_model, X_train, y_train, model_name):
    y_pred = ann_model.predict(X_train)
    y_prob = ann_model.predict_proba(X_train)[:, 1]

    roc_auc = roc_auc_score(y_train, y_prob)

    fpr, tpr, _ = roc_curve(y_train, y_prob)
    plt.figure()
    plt.plot(fpr, tpr, label=f'Neural Network (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve - Neural Network')
    plt.legend()
    plt.show()

    cm = confusion_matrix(y_train, y_pred)
    sensitivity = cm[1, 1] / (cm[1, 1] + cm[1, 0])
    specificity = cm[0, 0] / (cm[0, 0] + cm[0, 1])
    accuracy = accuracy_score(y_train, y_pred)

    sns.heatmap(cm, annot=True, fmt="d", cmap="Greens")
    plt.title('Confusion Matrix - Neural Network')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    return {'Accuracy': accuracy, 'ROC AUC': roc_auc, 'Sensitivity': sensitivity, 'Specificity': specificity}
In [35]:
ann_model = train_neural_network(X_train, y_train, hidden_layer_sizes=(100,), activation='relu', max_iter=1000)
initial_train_results_ann = evaluate_neural_network(ann_model, X_train, y_train, 'Artificial Neural Network')
results["ANN - Initial Train Set"] = initial_train_results_ann


results_df = pd.DataFrame({
    'Logistic Regression': original_train_results,
    'Logistic Regression (CV)': cv_train_results,
    'Logistic Regression (Test)': test_results,
    'Decision Tree - Initial Train Set': initial_train_results_dt,
    'Decision Tree - CV Train Set': cv_train_results_dt,
    'Decision Tree - Test Set': test_results_dt,
    'Decision Tree - Gini - Limited Depth': evaluate_decision_tree(dt_model_gini, X_test, y_test, 'Decision Tree - Gini - Limited Depth'),
    'Decision Tree - Entropy - Limited Depth': evaluate_decision_tree(dt_model_entropy, X_test, y_test, 'Decision Tree - Entropy - Limited Depth'),
    'ANN': initial_train_results_ann
}).T
display(results_df)
Iteration 1, loss = 4.02230292
Iteration 2, loss = 2.10454022
Iteration 3, loss = 0.86947080
Iteration 4, loss = 0.63834363
Iteration 5, loss = 0.92990311
Iteration 6, loss = 1.12074851
Iteration 7, loss = 1.02213408
Iteration 8, loss = 0.80638496
Iteration 9, loss = 0.64397050
Iteration 10, loss = 0.61286203
Iteration 11, loss = 0.65718838
Iteration 12, loss = 0.70880343
Iteration 13, loss = 0.71580951
Iteration 14, loss = 0.67234684
Iteration 15, loss = 0.62725891
Iteration 16, loss = 0.58560556
Iteration 17, loss = 0.57560980
Iteration 18, loss = 0.57066244
Iteration 19, loss = 0.56298485
Iteration 20, loss = 0.54736251
Iteration 21, loss = 0.52878874
Iteration 22, loss = 0.51326551
Iteration 23, loss = 0.49312804
Iteration 24, loss = 0.48197188
Iteration 25, loss = 0.47281459
Iteration 26, loss = 0.46184911
Iteration 27, loss = 0.44970322
Iteration 28, loss = 0.43596365
Iteration 29, loss = 0.41821720
Iteration 30, loss = 0.40762723
Iteration 31, loss = 0.39689920
Iteration 32, loss = 0.38930702
Iteration 33, loss = 0.38296267
Iteration 34, loss = 0.37670067
Iteration 35, loss = 0.36881320
Iteration 36, loss = 0.36026601
Iteration 37, loss = 0.36009198
Iteration 38, loss = 0.35783499
Iteration 39, loss = 0.34957359
Iteration 40, loss = 0.34125632
Iteration 41, loss = 0.33902456
Iteration 42, loss = 0.33582467
Iteration 43, loss = 0.32950126
Iteration 44, loss = 0.32532578
Iteration 45, loss = 0.32451970
Iteration 46, loss = 0.32281269
Iteration 47, loss = 0.32066127
Iteration 48, loss = 0.31576226
Iteration 49, loss = 0.31280710
Iteration 50, loss = 0.31048232
Iteration 51, loss = 0.30959967
Iteration 52, loss = 0.30675096
Iteration 53, loss = 0.30470100
Iteration 54, loss = 0.30382580
Iteration 55, loss = 0.30156411
Iteration 56, loss = 0.30085709
Iteration 57, loss = 0.29825943
Iteration 58, loss = 0.29573218
Iteration 59, loss = 0.29429188
Iteration 60, loss = 0.29269111
Iteration 61, loss = 0.29140750
Iteration 62, loss = 0.29282380
Iteration 63, loss = 0.29039945
Iteration 64, loss = 0.28653339
Iteration 65, loss = 0.28746691
Iteration 66, loss = 0.28743537
Iteration 67, loss = 0.28227269
Iteration 68, loss = 0.28445777
Iteration 69, loss = 0.28291903
Iteration 70, loss = 0.27880415
Iteration 71, loss = 0.27871387
Iteration 72, loss = 0.27958179
Iteration 73, loss = 0.27518067
Iteration 74, loss = 0.27381807
Iteration 75, loss = 0.27147686
Iteration 76, loss = 0.27001834
Iteration 77, loss = 0.26881486
Iteration 78, loss = 0.26853387
Iteration 79, loss = 0.26798311
Iteration 80, loss = 0.26576272
Iteration 81, loss = 0.26483172
Iteration 82, loss = 0.26545454
Iteration 83, loss = 0.26324829
Iteration 84, loss = 0.26074457
Iteration 85, loss = 0.26010622
Iteration 86, loss = 0.25794394
Iteration 87, loss = 0.25850639
Iteration 88, loss = 0.25669207
Iteration 89, loss = 0.25484294
Iteration 90, loss = 0.25658803
Iteration 91, loss = 0.25209274
Iteration 92, loss = 0.25700625
Iteration 93, loss = 0.25482831
Iteration 94, loss = 0.24928817
Iteration 95, loss = 0.25103163
Iteration 96, loss = 0.24688681
Iteration 97, loss = 0.24745477
Iteration 98, loss = 0.25403423
Iteration 99, loss = 0.24920378
Iteration 100, loss = 0.24413673
Iteration 101, loss = 0.24500362
Iteration 102, loss = 0.24273261
Iteration 103, loss = 0.24267903
Iteration 104, loss = 0.24122914
Iteration 105, loss = 0.23857052
Iteration 106, loss = 0.23781333
Iteration 107, loss = 0.23669402
Iteration 108, loss = 0.23579680
Iteration 109, loss = 0.23448541
Iteration 110, loss = 0.23481622
Iteration 111, loss = 0.23363717
Iteration 112, loss = 0.23239794
Iteration 113, loss = 0.23182923
Iteration 114, loss = 0.23044262
Iteration 115, loss = 0.22943965
Iteration 116, loss = 0.23016530
Iteration 117, loss = 0.22802171
Iteration 118, loss = 0.22735506
Iteration 119, loss = 0.23126033
Iteration 120, loss = 0.22780703
Iteration 121, loss = 0.22509048
Iteration 122, loss = 0.22728015
Iteration 123, loss = 0.22363601
Iteration 124, loss = 0.22647265
Iteration 125, loss = 0.22786131
Iteration 126, loss = 0.22139982
Iteration 127, loss = 0.22084346
Iteration 128, loss = 0.21927932
Iteration 129, loss = 0.21913891
Iteration 130, loss = 0.21673575
Iteration 131, loss = 0.21834780
Iteration 132, loss = 0.22036976
Iteration 133, loss = 0.21580011
Iteration 134, loss = 0.21415451
Iteration 135, loss = 0.21484682
Iteration 136, loss = 0.21173361
Iteration 137, loss = 0.21456477
Iteration 138, loss = 0.21810526
Iteration 139, loss = 0.21252006
Iteration 140, loss = 0.21039078
Iteration 141, loss = 0.21035409
Iteration 142, loss = 0.20973249
Iteration 143, loss = 0.20748699
Iteration 144, loss = 0.20696330
Iteration 145, loss = 0.20592567
Iteration 146, loss = 0.20578461
Iteration 147, loss = 0.20528378
Iteration 148, loss = 0.20517077
Iteration 149, loss = 0.20504235
Iteration 150, loss = 0.20473887
Iteration 151, loss = 0.20418614
Iteration 152, loss = 0.20357433
Iteration 153, loss = 0.20130895
Iteration 154, loss = 0.20459066
Iteration 155, loss = 0.20086647
Iteration 156, loss = 0.20362074
Iteration 157, loss = 0.20427172
Iteration 158, loss = 0.19801184
Iteration 159, loss = 0.19825854
Iteration 160, loss = 0.19987893
Iteration 161, loss = 0.19808404
Iteration 162, loss = 0.19706006
Iteration 163, loss = 0.19545960
Iteration 164, loss = 0.19198142
Iteration 165, loss = 0.19874646
Iteration 166, loss = 0.19975094
Iteration 167, loss = 0.18969855
Iteration 168, loss = 0.19592014
Iteration 169, loss = 0.20318988
Iteration 170, loss = 0.18861323
Iteration 171, loss = 0.19687122
Iteration 172, loss = 0.19971058
Iteration 173, loss = 0.18872662
Iteration 174, loss = 0.19407785
Iteration 175, loss = 0.18840658
Iteration 176, loss = 0.18642795
Iteration 177, loss = 0.20257853
Iteration 178, loss = 0.19036916
Iteration 179, loss = 0.18954640
Iteration 180, loss = 0.19262127
Iteration 181, loss = 0.18555495
Iteration 182, loss = 0.18221644
Iteration 183, loss = 0.18179603
Iteration 184, loss = 0.18384272
Iteration 185, loss = 0.18038333
Iteration 186, loss = 0.18837300
Iteration 187, loss = 0.18552340
Iteration 188, loss = 0.17813300
Iteration 189, loss = 0.17841459
Iteration 190, loss = 0.17746224
Iteration 191, loss = 0.17615905
Iteration 192, loss = 0.17706448
Iteration 193, loss = 0.17445022
Iteration 194, loss = 0.17870040
Iteration 195, loss = 0.17945170
Iteration 196, loss = 0.17474412
Iteration 197, loss = 0.17325657
Iteration 198, loss = 0.17223660
Iteration 199, loss = 0.17192794
Iteration 200, loss = 0.17387431
Iteration 201, loss = 0.17361957
Iteration 202, loss = 0.17248654
Iteration 203, loss = 0.17401968
Iteration 204, loss = 0.17101373
Iteration 205, loss = 0.17081506
Iteration 206, loss = 0.17097379
Iteration 207, loss = 0.16758544
Iteration 208, loss = 0.16972054
Iteration 209, loss = 0.17010854
Iteration 210, loss = 0.16597442
Iteration 211, loss = 0.16795522
Iteration 212, loss = 0.16823160
Iteration 213, loss = 0.16418037
Iteration 214, loss = 0.16697008
Iteration 215, loss = 0.16582084
Iteration 216, loss = 0.16382480
Iteration 217, loss = 0.16484471
Iteration 218, loss = 0.16243197
Iteration 219, loss = 0.16218383
Iteration 220, loss = 0.16236868
Iteration 221, loss = 0.16106752
Iteration 222, loss = 0.16034033
Iteration 223, loss = 0.16019903
Iteration 224, loss = 0.15939317
Iteration 225, loss = 0.15812903
Iteration 226, loss = 0.15967220
Iteration 227, loss = 0.15972957
Iteration 228, loss = 0.15869792
Iteration 229, loss = 0.15701914
Iteration 230, loss = 0.15660028
Iteration 231, loss = 0.15681639
Iteration 232, loss = 0.15677736
Iteration 233, loss = 0.15779404
Iteration 234, loss = 0.16085464
Iteration 235, loss = 0.17011883
Iteration 236, loss = 0.15488930
Iteration 237, loss = 0.16842559
Iteration 238, loss = 0.15835579
Iteration 239, loss = 0.15875102
Iteration 240, loss = 0.16379257
Iteration 241, loss = 0.15261567
Iteration 242, loss = 0.15714449
Iteration 243, loss = 0.15278826
Iteration 244, loss = 0.15089522
Iteration 245, loss = 0.15671092
Iteration 246, loss = 0.15103484
Iteration 247, loss = 0.15231768
Iteration 248, loss = 0.15242954
Iteration 249, loss = 0.14729689
Iteration 250, loss = 0.14944591
Iteration 251, loss = 0.15086011
Iteration 252, loss = 0.14731480
Iteration 253, loss = 0.14643971
Iteration 254, loss = 0.14697819
Iteration 255, loss = 0.14535753
Iteration 256, loss = 0.14441778
Iteration 257, loss = 0.14441094
Iteration 258, loss = 0.14371784
Iteration 259, loss = 0.14410804
Iteration 260, loss = 0.14373181
Iteration 261, loss = 0.14281049
Iteration 262, loss = 0.14266847
Iteration 263, loss = 0.14150942
Iteration 264, loss = 0.14135549
Iteration 265, loss = 0.14306235
Iteration 266, loss = 0.13924119
Iteration 267, loss = 0.14683114
Iteration 268, loss = 0.14989169
Iteration 269, loss = 0.14164417
Iteration 270, loss = 0.14145437
Iteration 271, loss = 0.13924703
Iteration 272, loss = 0.13965202
Iteration 273, loss = 0.14137038
Iteration 274, loss = 0.14434584
Iteration 275, loss = 0.13998019
Iteration 276, loss = 0.14006837
Iteration 277, loss = 0.13590659
Iteration 278, loss = 0.14019764
Iteration 279, loss = 0.14451119
Iteration 280, loss = 0.13560581
Iteration 281, loss = 0.13674836
Iteration 282, loss = 0.13751512
Iteration 283, loss = 0.13359945
Iteration 284, loss = 0.13567546
Iteration 285, loss = 0.13645231
Iteration 286, loss = 0.13285221
Iteration 287, loss = 0.13333664
Iteration 288, loss = 0.13635857
Iteration 289, loss = 0.13312503
Iteration 290, loss = 0.13223208
Iteration 291, loss = 0.13965799
Iteration 292, loss = 0.13122649
Iteration 293, loss = 0.13519012
Iteration 294, loss = 0.13540466
Iteration 295, loss = 0.13035279
Iteration 296, loss = 0.13180443
Iteration 297, loss = 0.12740952
Iteration 298, loss = 0.13376283
Iteration 299, loss = 0.13312770
Iteration 300, loss = 0.12937742
Iteration 301, loss = 0.13417229
Iteration 302, loss = 0.12919634
Iteration 303, loss = 0.12877167
Iteration 304, loss = 0.12688507
Iteration 305, loss = 0.12518773
Iteration 306, loss = 0.12507342
Iteration 307, loss = 0.12510165
Iteration 308, loss = 0.12449899
Iteration 309, loss = 0.12375685
Iteration 310, loss = 0.12734862
Iteration 311, loss = 0.12557102
Iteration 312, loss = 0.12164985
Iteration 313, loss = 0.13707716
Iteration 314, loss = 0.12867239
Iteration 315, loss = 0.12589512
Iteration 316, loss = 0.13253958
Iteration 317, loss = 0.12346435
Iteration 318, loss = 0.12208080
Iteration 319, loss = 0.12245356
Iteration 320, loss = 0.11961748
Iteration 321, loss = 0.12394614
Iteration 322, loss = 0.12021753
Iteration 323, loss = 0.12164048
Iteration 324, loss = 0.12626440
Iteration 325, loss = 0.12032976
Iteration 326, loss = 0.12178975
Iteration 327, loss = 0.12213310
Iteration 328, loss = 0.11803913
Iteration 329, loss = 0.12114994
Iteration 330, loss = 0.11725703
Iteration 331, loss = 0.12076094
Iteration 332, loss = 0.12026022
Iteration 333, loss = 0.11602327
Iteration 334, loss = 0.11995465
Iteration 335, loss = 0.11718524
Iteration 336, loss = 0.11609747
Iteration 337, loss = 0.11760468
Iteration 338, loss = 0.11490976
Iteration 339, loss = 0.11496963
Iteration 340, loss = 0.11422677
Iteration 341, loss = 0.11375234
Iteration 342, loss = 0.11340566
Iteration 343, loss = 0.11351552
Iteration 344, loss = 0.11356643
Iteration 345, loss = 0.11311543
Iteration 346, loss = 0.11259733
Iteration 347, loss = 0.11124089
Iteration 348, loss = 0.11185655
Iteration 349, loss = 0.11200215
Iteration 350, loss = 0.11118142
Iteration 351, loss = 0.11030712
Iteration 352, loss = 0.11021561
Iteration 353, loss = 0.10982264
Iteration 354, loss = 0.10987060
Iteration 355, loss = 0.10924542
Iteration 356, loss = 0.10947071
Iteration 357, loss = 0.10895485
Iteration 358, loss = 0.10889616
Iteration 359, loss = 0.10804792
Iteration 360, loss = 0.10791342
Iteration 361, loss = 0.10907126
Iteration 362, loss = 0.10781227
Iteration 363, loss = 0.10740163
Iteration 364, loss = 0.10885671
Iteration 365, loss = 0.10664161
Iteration 366, loss = 0.11066026
Iteration 367, loss = 0.10595491
Iteration 368, loss = 0.10856119
Iteration 369, loss = 0.11647613
Iteration 370, loss = 0.10547562
Iteration 371, loss = 0.11319035
Iteration 372, loss = 0.11453445
Iteration 373, loss = 0.10660037
Iteration 374, loss = 0.10672690
Iteration 375, loss = 0.10351444
Iteration 376, loss = 0.10486834
Iteration 377, loss = 0.10980695
Iteration 378, loss = 0.10328256
Iteration 379, loss = 0.10588789
Iteration 380, loss = 0.10731047
Iteration 381, loss = 0.10215224
Iteration 382, loss = 0.10181712
Iteration 383, loss = 0.10125605
Iteration 384, loss = 0.10228608
Iteration 385, loss = 0.10076854
Iteration 386, loss = 0.10212498
Iteration 387, loss = 0.10126292
Iteration 388, loss = 0.10246588
Iteration 389, loss = 0.10508157
Iteration 390, loss = 0.10176577
Iteration 391, loss = 0.10079252
Iteration 392, loss = 0.09880238
Iteration 393, loss = 0.10208954
Iteration 394, loss = 0.09894385
Iteration 395, loss = 0.10188564
Iteration 396, loss = 0.10125334
Iteration 397, loss = 0.09814275
Iteration 398, loss = 0.09847286
Iteration 399, loss = 0.09768785
Iteration 400, loss = 0.09746975
Iteration 401, loss = 0.09705566
Iteration 402, loss = 0.09770101
Iteration 403, loss = 0.09765786
Iteration 404, loss = 0.09823282
Iteration 405, loss = 0.09750841
Iteration 406, loss = 0.09622743
Iteration 407, loss = 0.09536445
Iteration 408, loss = 0.09492604
Iteration 409, loss = 0.09453060
Iteration 410, loss = 0.09465540
Iteration 411, loss = 0.09405552
Iteration 412, loss = 0.09404265
Iteration 413, loss = 0.09435809
Iteration 414, loss = 0.09354805
Iteration 415, loss = 0.09284339
Iteration 416, loss = 0.09329854
Iteration 417, loss = 0.09286583
Iteration 418, loss = 0.09342240
Iteration 419, loss = 0.09173622
Iteration 420, loss = 0.09311403
Iteration 421, loss = 0.09077651
Iteration 422, loss = 0.09321873
Iteration 423, loss = 0.09357174
Iteration 424, loss = 0.09166461
Iteration 425, loss = 0.09352065
Iteration 426, loss = 0.08989276
Iteration 427, loss = 0.09230079
Iteration 428, loss = 0.09229090
Iteration 429, loss = 0.08877926
Iteration 430, loss = 0.09419020
Iteration 431, loss = 0.09059947
Iteration 432, loss = 0.09079556
Iteration 433, loss = 0.09054734
Iteration 434, loss = 0.08770177
Iteration 435, loss = 0.09359509
Iteration 436, loss = 0.08819395
Iteration 437, loss = 0.09153621
Iteration 438, loss = 0.09317750
Iteration 439, loss = 0.08635020
Iteration 440, loss = 0.09001044
Iteration 441, loss = 0.09009227
Iteration 442, loss = 0.08569725
Iteration 443, loss = 0.09009921
Iteration 444, loss = 0.08791455
Iteration 445, loss = 0.08970940
Iteration 446, loss = 0.09254000
Iteration 447, loss = 0.08717615
Iteration 448, loss = 0.08879322
Iteration 449, loss = 0.08509210
Iteration 450, loss = 0.08938803
Iteration 451, loss = 0.08714110
Iteration 452, loss = 0.08362052
Iteration 453, loss = 0.08622018
Iteration 454, loss = 0.08347077
Iteration 455, loss = 0.08590590
Iteration 456, loss = 0.08621327
Iteration 457, loss = 0.08234254
Iteration 458, loss = 0.08418872
Iteration 459, loss = 0.08382589
Iteration 460, loss = 0.08233419
Iteration 461, loss = 0.08205337
Iteration 462, loss = 0.08343911
Iteration 463, loss = 0.08199110
Iteration 464, loss = 0.08120543
Iteration 465, loss = 0.08116426
Iteration 466, loss = 0.08080704
Iteration 467, loss = 0.08046211
Iteration 468, loss = 0.08026456
Iteration 469, loss = 0.08064470
Iteration 470, loss = 0.07995165
Iteration 471, loss = 0.08244801
Iteration 472, loss = 0.08266426
Iteration 473, loss = 0.08347053
Iteration 474, loss = 0.08072649
Iteration 475, loss = 0.07981491
Iteration 476, loss = 0.08162995
Iteration 477, loss = 0.07846417
Iteration 478, loss = 0.08384241
Iteration 479, loss = 0.08154267
Iteration 480, loss = 0.07840825
Iteration 481, loss = 0.08353738
Iteration 482, loss = 0.08160882
Iteration 483, loss = 0.07852621
Iteration 484, loss = 0.07844408
Iteration 485, loss = 0.07728118
Iteration 486, loss = 0.07703614
Iteration 487, loss = 0.07705122
Iteration 488, loss = 0.07667738
Iteration 489, loss = 0.07577607
Iteration 490, loss = 0.07723084
Iteration 491, loss = 0.07596730
Iteration 492, loss = 0.07628094
Iteration 493, loss = 0.07777272
Iteration 494, loss = 0.07465312
Iteration 495, loss = 0.07940364
Iteration 496, loss = 0.08560379
Iteration 497, loss = 0.07329538
Iteration 498, loss = 0.08452327
Iteration 499, loss = 0.08268718
Iteration 500, loss = 0.07561424
Iteration 501, loss = 0.07888863
Iteration 502, loss = 0.07339273
Iteration 503, loss = 0.07714437
Iteration 504, loss = 0.07403349
Iteration 505, loss = 0.07521156
Iteration 506, loss = 0.08397237
Iteration 507, loss = 0.07553793
Iteration 508, loss = 0.07679296
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy ROC AUC Sensitivity Specificity
Logistic Regression 0.952941 0.992431 0.962025 0.938144
Logistic Regression (CV) 0.925490 0.981924 0.948538 0.893864
Logistic Regression (Test) 0.918033 0.970862 0.923077 0.909091
Decision Tree - Initial Train Set 0.862745 0.943886 0.936709 0.742268
Decision Tree - CV Train Set 0.858824 0.924670 0.822785 0.917526
Decision Tree - Test Set 0.786885 0.828089 0.820513 0.727273
Decision Tree - Gini - Limited Depth 0.803279 0.831585 0.948718 0.545455
Decision Tree - Entropy - Limited Depth 0.786885 0.784965 0.897436 0.590909
ANN 0.984314 0.999739 1.000000 0.958763
In [36]:
models = train_neural_network_with_kfolds(X_train, y_train, hidden_layer_sizes=(100,), activation='relu', max_iter=1000)
cv_train_results_ann = evaluate_neural_network(models[-1], X_train, y_train, 'Artificial Neural Network')
results["ANN - CV Train Set"] = cv_train_results_ann


results_df = pd.DataFrame({
    'Logistic Regression': original_train_results,
    'Logistic Regression (CV)': cv_train_results,
    'Logistic Regression (Test)': test_results,
    'Decision Tree - Initial Train Set': initial_train_results_dt,
    'Decision Tree - CV Train Set': cv_train_results_dt,
    'Decision Tree - Test Set': test_results_dt,
    'Decision Tree - Gini - Limited Depth': evaluate_decision_tree(dt_model_gini, X_test, y_test, 'Decision Tree - Gini - Limited Depth'),
    'Decision Tree - Entropy - Limited Depth': evaluate_decision_tree(dt_model_entropy, X_test, y_test, 'Decision Tree - Entropy - Limited Depth'),
    'ANN': initial_train_results_ann,
    'Ann (CV)': cv_train_results_ann
}).T
display(results_df)
Iteration 1, loss = 4.22163960
Iteration 2, loss = 2.23266392
Iteration 3, loss = 0.98520147
Iteration 4, loss = 0.54672483
Iteration 5, loss = 0.66943191
Iteration 6, loss = 0.93194218
Iteration 7, loss = 1.02349321
Iteration 8, loss = 0.96440416
Iteration 9, loss = 0.79275934
Iteration 10, loss = 0.62363374
Iteration 11, loss = 0.56077809
Iteration 12, loss = 0.57959426
Iteration 13, loss = 0.61640017
Iteration 14, loss = 0.64929655
Iteration 15, loss = 0.65909269
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 4.22354979
Iteration 2, loss = 2.25010972
Iteration 3, loss = 0.97161520
Iteration 4, loss = 0.58184682
Iteration 5, loss = 0.69279621
Iteration 6, loss = 0.80113225
Iteration 7, loss = 0.81207305
Iteration 8, loss = 0.73692669
Iteration 9, loss = 0.62985174
Iteration 10, loss = 0.58411131
Iteration 11, loss = 0.59810975
Iteration 12, loss = 0.62565651
Iteration 13, loss = 0.63808097
Iteration 14, loss = 0.62662879
Iteration 15, loss = 0.59222289
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 4.40308327
Iteration 2, loss = 2.35128481
Iteration 3, loss = 0.96430909
Iteration 4, loss = 0.58045854
Iteration 5, loss = 0.67514544
Iteration 6, loss = 0.82212449
Iteration 7, loss = 0.88910858
Iteration 8, loss = 0.81014449
Iteration 9, loss = 0.66751640
Iteration 10, loss = 0.57471490
Iteration 11, loss = 0.57928092
Iteration 12, loss = 0.63068796
Iteration 13, loss = 0.65174381
Iteration 14, loss = 0.62220160
Iteration 15, loss = 0.56242003
Iteration 16, loss = 0.52825348
Iteration 17, loss = 0.55181935
Iteration 18, loss = 0.62242423
Iteration 19, loss = 0.67718384
Iteration 20, loss = 0.68012165
Iteration 21, loss = 0.61506530
Iteration 22, loss = 0.53279130
Iteration 23, loss = 0.49080675
Iteration 24, loss = 0.48420794
Iteration 25, loss = 0.49438687
Iteration 26, loss = 0.51639462
Iteration 27, loss = 0.51440801
Iteration 28, loss = 0.48609095
Iteration 29, loss = 0.47502913
Iteration 30, loss = 0.47100567
Iteration 31, loss = 0.46709770
Iteration 32, loss = 0.47838106
Iteration 33, loss = 0.50439703
Iteration 34, loss = 0.46787340
Iteration 35, loss = 0.40034390
Iteration 36, loss = 0.39182700
Iteration 37, loss = 0.40217608
Iteration 38, loss = 0.38545705
Iteration 39, loss = 0.36846385
Iteration 40, loss = 0.36162610
Iteration 41, loss = 0.36822233
Iteration 42, loss = 0.38895401
Iteration 43, loss = 0.36801980
Iteration 44, loss = 0.35042417
Iteration 45, loss = 0.34514174
Iteration 46, loss = 0.33672332
Iteration 47, loss = 0.33139784
Iteration 48, loss = 0.32938864
Iteration 49, loss = 0.33058718
Iteration 50, loss = 0.34757384
Iteration 51, loss = 0.35010392
Iteration 52, loss = 0.32402906
Iteration 53, loss = 0.35111728
Iteration 54, loss = 0.39375771
Iteration 55, loss = 0.36079342
Iteration 56, loss = 0.31791175
Iteration 57, loss = 0.32537116
Iteration 58, loss = 0.32398032
Iteration 59, loss = 0.31360946
Iteration 60, loss = 0.31106698
Iteration 61, loss = 0.31525581
Iteration 62, loss = 0.33791280
Iteration 63, loss = 0.34376663
Iteration 64, loss = 0.31549679
Iteration 65, loss = 0.30202004
Iteration 66, loss = 0.30222907
Iteration 67, loss = 0.32165497
Iteration 68, loss = 0.35633922
Iteration 69, loss = 0.35763595
Iteration 70, loss = 0.30702865
Iteration 71, loss = 0.30646405
Iteration 72, loss = 0.41109411
Iteration 73, loss = 0.38481325
Iteration 74, loss = 0.29196668
Iteration 75, loss = 0.35370177
Iteration 76, loss = 0.49051039
Iteration 77, loss = 0.51066222
Iteration 78, loss = 0.37842278
Iteration 79, loss = 0.29594399
Iteration 80, loss = 0.33059990
Iteration 81, loss = 0.35648972
Iteration 82, loss = 0.31689687
Iteration 83, loss = 0.30439458
Iteration 84, loss = 0.34936877
Iteration 85, loss = 0.39807018
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 4.21620790
Iteration 2, loss = 2.24222414
Iteration 3, loss = 0.93538134
Iteration 4, loss = 0.61455033
Iteration 5, loss = 0.86172328
Iteration 6, loss = 1.10095280
Iteration 7, loss = 1.13128692
Iteration 8, loss = 1.05086702
Iteration 9, loss = 0.91318579
Iteration 10, loss = 0.76235227
Iteration 11, loss = 0.66191068
Iteration 12, loss = 0.63935631
Iteration 13, loss = 0.69674533
Iteration 14, loss = 0.81829953
Iteration 15, loss = 0.93396626
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 3.92051852
Iteration 2, loss = 2.13573007
Iteration 3, loss = 0.99513238
Iteration 4, loss = 0.68912846
Iteration 5, loss = 0.90766105
Iteration 6, loss = 1.15997858
Iteration 7, loss = 1.12201840
Iteration 8, loss = 0.91680068
Iteration 9, loss = 0.76407906
Iteration 10, loss = 0.71199174
Iteration 11, loss = 0.73502274
Iteration 12, loss = 0.77686128
Iteration 13, loss = 0.80032925
Iteration 14, loss = 0.79393632
Iteration 15, loss = 0.77233785
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Accuracy ROC AUC Sensitivity Specificity
Logistic Regression 0.952941 0.992431 0.962025 0.938144
Logistic Regression (CV) 0.925490 0.981924 0.948538 0.893864
Logistic Regression (Test) 0.918033 0.970862 0.923077 0.909091
Decision Tree - Initial Train Set 0.862745 0.943886 0.936709 0.742268
Decision Tree - CV Train Set 0.858824 0.924670 0.822785 0.917526
Decision Tree - Test Set 0.786885 0.828089 0.820513 0.727273
Decision Tree - Gini - Limited Depth 0.803279 0.831585 0.948718 0.545455
Decision Tree - Entropy - Limited Depth 0.786885 0.784965 0.897436 0.590909
ANN 0.984314 0.999739 1.000000 0.958763
Ann (CV) 0.780392 0.865718 0.784810 0.773196
In [37]:
test_results_ann = evaluate_neural_network(models[-1], X_test, y_test, 'ANN - Test Set')
results["ANN - Test Set"] = test_results_ann


results_df = pd.DataFrame({
    'Logistic Regression': original_train_results,
    'Logistic Regression (CV)': cv_train_results,
    'Logistic Regression (Test)': test_results,
    'Decision Tree': initial_train_results_dt,
    'Decision Tree (CV)': cv_train_results_dt,
    'Decision Tree (Test)': test_results_dt,
    'Decision Tree - Gini - Limited Depth': evaluate_decision_tree(dt_model_gini, X_test, y_test, 'Decision Tree - Gini - Limited Depth'),
    'Decision Tree - Entropy - Limited Depth': evaluate_decision_tree(dt_model_entropy, X_test, y_test, 'Decision Tree - Entropy - Limited Depth'),
    'ANN': initial_train_results_ann,
    'ANN (CV)': cv_train_results_ann,
    'ANN (Test)': test_results_ann
}).T

display(results_df)
Accuracy ROC AUC Sensitivity Specificity
Logistic Regression 0.952941 0.992431 0.962025 0.938144
Logistic Regression (CV) 0.925490 0.981924 0.948538 0.893864
Logistic Regression (Test) 0.918033 0.970862 0.923077 0.909091
Decision Tree 0.862745 0.943886 0.936709 0.742268
Decision Tree (CV) 0.858824 0.924670 0.822785 0.917526
Decision Tree (Test) 0.786885 0.828089 0.820513 0.727273
Decision Tree - Gini - Limited Depth 0.803279 0.831585 0.948718 0.545455
Decision Tree - Entropy - Limited Depth 0.786885 0.784965 0.897436 0.590909
ANN 0.984314 0.999739 1.000000 0.958763
ANN (CV) 0.780392 0.865718 0.784810 0.773196
ANN (Test) 0.737705 0.770396 0.717949 0.772727
In [38]:
y_pred_lr = initial_lr_model.predict(X_test)
y_pred_lr_cv = cv_lr_model.predict(X_test)

y_pred_dt = dt_initial.predict(X_test)
y_pred_dt_cv = dt_cv.predict(X_test)

y_pred_dt_gini = dt_model_gini.predict(X_test)
y_pred_dt_entropy = dt_model_entropy.predict(X_test)

y_pred_ann = ann_model.predict(X_test)
y_pred_ann_cv = models[-1].predict(X_test)

precision_scores = {
    "Logistic Regression": precision_score(y_test, y_pred_lr),
    "Logistic Regression - CV": precision_score(y_test, y_pred_lr_cv),
    "Decision Tree": precision_score(y_test, y_pred_dt),
    "Decision Tree - CV": precision_score(y_test, y_pred_dt_cv),
    "Decision Tree - Gini - Limited Depth": precision_score(y_test, y_pred_dt_gini),
    "Decision Tree - Entropy - Limited Depth": precision_score(y_test, y_pred_dt_entropy),
    "Artificial Neural Network": precision_score(y_test, y_pred_ann),
    "Artificial Neural Network - CV": precision_score(y_test, y_pred_ann_cv)
}
model_names = list(precision_scores.keys())
precision_values = list(precision_scores.values())
norm = plt.Normalize(min(precision_values), max(precision_values))
colors = [plt.cm.Greens(norm(value)) for value in precision_values]
plt.figure(figsize=(10, 8))
bar = plt.bar(model_names, precision_values, color=colors)
plt.xlabel('Model')
plt.ylabel('Precision Score')
plt.title('Comparison of Model Precision Scores')
plt.xticks(rotation=45, fontsize=6)
sm = cm.ScalarMappable(cmap=plt.cm.Greens, norm=norm)
sm.set_array([])
plt.colorbar(sm, label='Precision Score')
plt.tight_layout()
plt.show()
Out[38]:
<Figure size 1000x800 with 0 Axes>
Out[38]:
Text(0.5, 0, 'Model')
Out[38]:
Text(0, 0.5, 'Precision Score')
Out[38]:
Text(0.5, 1.0, 'Comparison of Model Precision Scores')
Out[38]:
([0, 1, 2, 3, 4, 5, 6, 7],
 [Text(0, 0, 'Logistic Regression'),
  Text(1, 0, 'Logistic Regression - CV'),
  Text(2, 0, 'Decision Tree'),
  Text(3, 0, 'Decision Tree - CV'),
  Text(4, 0, 'Decision Tree - Gini - Limited Depth'),
  Text(5, 0, 'Decision Tree - Entropy - Limited Depth'),
  Text(6, 0, 'Artificial Neural Network'),
  Text(7, 0, 'Artificial Neural Network - CV')])
<ipython-input-38-c9fb05c09afa>:35: MatplotlibDeprecationWarning: Unable to determine Axes to steal space for Colorbar. Using gca(), but will raise in the future. Either provide the *cax* argument to use as the Axes for the Colorbar, provide the *ax* argument to steal space from it, or add *mappable* to an Axes.
  plt.colorbar(sm, label='Precision Score')
Out[38]:
<matplotlib.colorbar.Colorbar at 0x7ddd520bc190>
In [39]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [40]:
! pwd
/content
In [41]:
%%shell
jupyter nbconvert --to html //content/Breast_Cancer_Survival_Prediction_01.ipynb
[NbConvertApp] WARNING | pattern '//content/Breast_Cancer_Survival_Prediction_01.ipynb' matched no files
This application is used to convert notebook files (*.ipynb)
        to various other formats.

        WARNING: THE COMMANDLINE INTERFACE MAY CHANGE IN FUTURE RELEASES.

Options
=======
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePreprocessor.enabled=True]
--allow-errors
    Continue notebook execution even if one of the cells throws an error and include the error message in the cell output (the default behaviour is to abort conversion). This flag is only relevant if '--execute' was specified, too.
    Equivalent to: [--ExecutePreprocessor.allow_errors=True]
--stdin
    read a single notebook file from stdin. Write the resulting notebook with default basename 'notebook.*'
    Equivalent to: [--NbConvertApp.from_stdin=True]
--stdout
    Write notebook output to stdout instead of files.
    Equivalent to: [--NbConvertApp.writer_class=StdoutWriter]
--inplace
    Run nbconvert in place, overwriting the existing notebook (only
            relevant when converting to notebook format)
    Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory=]
--clear-output
    Clear output of current file and save in place,
            overwriting the existing notebook.
    Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory= --ClearOutputPreprocessor.enabled=True]
--no-prompt
    Exclude input and output prompts from converted document.
    Equivalent to: [--TemplateExporter.exclude_input_prompt=True --TemplateExporter.exclude_output_prompt=True]
--no-input
    Exclude input cells and output prompts from converted document.
            This mode is ideal for generating code-free reports.
    Equivalent to: [--TemplateExporter.exclude_output_prompt=True --TemplateExporter.exclude_input=True --TemplateExporter.exclude_input_prompt=True]
--allow-chromium-download
    Whether to allow downloading chromium if no suitable version is found on the system.
    Equivalent to: [--WebPDFExporter.allow_chromium_download=True]
--disable-chromium-sandbox
    Disable chromium security sandbox when converting to PDF..
    Equivalent to: [--WebPDFExporter.disable_sandbox=True]
--show-input
    Shows code input. This flag is only useful for dejavu users.
    Equivalent to: [--TemplateExporter.exclude_input=False]
--embed-images
    Embed the images as base64 dataurls in the output. This flag is only useful for the HTML/WebPDF/Slides exports.
    Equivalent to: [--HTMLExporter.embed_images=True]
--sanitize-html
    Whether the HTML in Markdown cells and cell outputs should be sanitized..
    Equivalent to: [--HTMLExporter.sanitize_html=True]
--log-level=<Enum>
    Set the log level by value or name.
    Choices: any of [0, 10, 20, 30, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL']
    Default: 30
    Equivalent to: [--Application.log_level]
--config=<Unicode>
    Full path of a config file.
    Default: ''
    Equivalent to: [--JupyterApp.config_file]
--to=<Unicode>
    The export format to be used, either one of the built-in formats
            ['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'rst', 'script', 'slides', 'webpdf']
            or a dotted object name that represents the import path for an
            ``Exporter`` class
    Default: ''
    Equivalent to: [--NbConvertApp.export_format]
--template=<Unicode>
    Name of the template to use
    Default: ''
    Equivalent to: [--TemplateExporter.template_name]
--template-file=<Unicode>
    Name of the template file to use
    Default: None
    Equivalent to: [--TemplateExporter.template_file]
--theme=<Unicode>
    Template specific theme(e.g. the name of a JupyterLab CSS theme distributed
    as prebuilt extension for the lab template)
    Default: 'light'
    Equivalent to: [--HTMLExporter.theme]
--sanitize_html=<Bool>
    Whether the HTML in Markdown cells and cell outputs should be sanitized.This
    should be set to True by nbviewer or similar tools.
    Default: False
    Equivalent to: [--HTMLExporter.sanitize_html]
--writer=<DottedObjectName>
    Writer class used to write the
                                        results of the conversion
    Default: 'FilesWriter'
    Equivalent to: [--NbConvertApp.writer_class]
--post=<DottedOrNone>
    PostProcessor class used to write the
                                        results of the conversion
    Default: ''
    Equivalent to: [--NbConvertApp.postprocessor_class]
--output=<Unicode>
    overwrite base name use for output files.
                can only be used when converting one notebook at a time.
    Default: ''
    Equivalent to: [--NbConvertApp.output_base]
--output-dir=<Unicode>
    Directory to write output(s) to. Defaults
                                  to output to the directory of each notebook. To recover
                                  previous default behaviour (outputting to the current
                                  working directory) use . as the flag value.
    Default: ''
    Equivalent to: [--FilesWriter.build_directory]
--reveal-prefix=<Unicode>
    The URL prefix for reveal.js (version 3.x).
            This defaults to the reveal CDN, but can be any url pointing to a copy
            of reveal.js.
            For speaker notes to work, this must be a relative path to a local
            copy of reveal.js: e.g., "reveal.js".
            If a relative path is given, it must be a subdirectory of the
            current directory (from which the server is run).
            See the usage documentation
            (https://nbconvert.readthedocs.io/en/latest/usage.html#reveal-js-html-slideshow)
            for more details.
    Default: ''
    Equivalent to: [--SlidesExporter.reveal_url_prefix]
--nbformat=<Enum>
    The nbformat version to write.
            Use this to downgrade notebooks.
    Choices: any of [1, 2, 3, 4]
    Default: 4
    Equivalent to: [--NotebookExporter.nbformat_version]

Examples
--------

    The simplest way to use nbconvert is

            > jupyter nbconvert mynotebook.ipynb --to html

            Options include ['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'rst', 'script', 'slides', 'webpdf'].

            > jupyter nbconvert --to latex mynotebook.ipynb

            Both HTML and LaTeX support multiple output templates. LaTeX includes
            'base', 'article' and 'report'.  HTML includes 'basic', 'lab' and
            'classic'. You can specify the flavor of the format used.

            > jupyter nbconvert --to html --template lab mynotebook.ipynb

            You can also pipe the output to stdout, rather than a file

            > jupyter nbconvert mynotebook.ipynb --stdout

            PDF is generated via latex

            > jupyter nbconvert mynotebook.ipynb --to pdf

            You can get (and serve) a Reveal.js-powered slideshow

            > jupyter nbconvert myslides.ipynb --to slides --post serve

            Multiple notebooks can be given at the command line in a couple of
            different ways:

            > jupyter nbconvert notebook*.ipynb
            > jupyter nbconvert notebook1.ipynb notebook2.ipynb

            or you can specify the notebooks list in a config file, containing::

                c.NbConvertApp.notebooks = ["my_notebook.ipynb"]

            > jupyter nbconvert --config mycfg.py

To see all available configurables, use `--help-all`.

---------------------------------------------------------------------------
CalledProcessError                        Traceback (most recent call last)
<ipython-input-41-4ad7f0406dcb> in <cell line: 1>()
----> 1 get_ipython().run_cell_magic('shell', '', 'jupyter nbconvert --to html //content/Breast_Cancer_Survival_Prediction_01.ipynb\n')

/usr/local/lib/python3.10/dist-packages/google/colab/_shell.py in run_cell_magic(self, magic_name, line, cell)
    332     if line and not cell:
    333       cell = ' '
--> 334     return super().run_cell_magic(magic_name, line, cell)
    335 
    336 

/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py in run_cell_magic(self, magic_name, line, cell)
   2471             with self.builtin_trap:
   2472                 args = (magic_arg_s, cell)
-> 2473                 result = fn(*args, **kwargs)
   2474             return result
   2475 

/usr/local/lib/python3.10/dist-packages/google/colab/_system_commands.py in _shell_cell_magic(args, cmd)
    110   result = _run_command(cmd, clear_streamed_output=False)
    111   if not parsed_args.ignore_errors:
--> 112     result.check_returncode()
    113   return result
    114 

/usr/local/lib/python3.10/dist-packages/google/colab/_system_commands.py in check_returncode(self)
    135   def check_returncode(self):
    136     if self.returncode:
--> 137       raise subprocess.CalledProcessError(
    138           returncode=self.returncode, cmd=self.args, output=self.output
    139       )

CalledProcessError: Command 'jupyter nbconvert --to html //content/Breast_Cancer_Survival_Prediction_01.ipynb
' returned non-zero exit status 255.